##########################################
## Measurement error:                   ##
## September 15, 2020                   ##
## Nadiya Kostuyk                       ##
##########################################

rm(list=ls())

## Install & load packages (all at once)
list.of.packages <- c('knitr', 'countrycode', 'dplyr','foreign','glmmTMB','lme4',
                      'stargazer')
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]; if(length(new.packages)){install.packages(new.packages,dependencies=TRUE)}
lapply(list.of.packages, require, character.only = TRUE)

# setting directory: 
setwd("C:/Users/nkostyuk3/Dropbox (GaTech)/ResearchProjects/GartzkeKostyuk/Complements v. Substitutes/Publication/Complementarity/JCR/Replication")


#######################################
# loading functions: 
source("measurement_functions.R")


#################
# loading data: 
load('main_data.RData')


data_cyb = data %>%
  filter(TYPE==1)
data_mil = data %>%
  filter(TYPE==-1)  


# simulate data 200 times using the function:
if(!file.exists('seeds.RData')){
  seeds=runif(200)*10000 
  save(seeds, file='seeds.RData')
} else{
  load('seeds.RData') 
}

sim = seq(1, 200, by =1)
s=1; sim[s]
res_sim_list = vector(mode = "list", length = length(sim))
names(res_sim_list) = sim

# creating list of datasets for each simulation:
data_sim_list = vector(mode = "list", length = length(sim))
names(data_sim_list) = sim



####################################################
# 1. Systematic bias:increasing number of cyber events
####################################################

#################################
# 1.1.start with adding COs (randomly)
# # Results for Figure 2 (Appendix)


# simulate data 200 times 

# 1.1.1.: 10 % of cyber operations are randomly distributed:
for(s in 1:length(sim)){
  set.seed(seeds[s])
  data_latent = simulate_extra_events_random(data_cyb, data_mil, p0 = .1)
  #dim(data_latent)
  data_sim_list[[s]] = data_latent

  ##################
  # Refit the model
  mod_sim <- lme4::glmer(OUTCOME~TYPE*(CYBER_LAG_NA+CONVENTIONAL_LAG_NA
                                       +LOG_INT_USERS_A_sc+LOG_INT_USERS_B_sc
                                       +CINC_A_sc)+INV_CAP_DISTANCE_sc+NUCLEAR
                         +(1+TYPE2||DYAD),
                         data=data_latent,
                         family=binomial,
                         control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=1e6))
                          )
  res_sim_list[[s]] = mod_sim
  cat(s, 'out of', length(sim), '\n')
}
#save(data_sim_list, file = '')

 
############################################################
# 1.2. simulating COs from the model:

res_sim_list = vector(mode = "list", length = length(sim))
names(res_sim_list) = sim

# creating list of datasets for each simulation:
data_sim_list = vector(mode = "list", length = length(sim))
names(data_sim_list) = sim

# TO DO: remove this model
# loading results from the main model and calling it mod
load('main_wcybr_year.RData')
mod = mod_list[[10]]


#################################
#1.2.1. 10% of under-reporting
# Results for Figure 3 (Appendix)

for(s in 1:length(sim)){
  set.seed(seeds[s])
  data_latent = simulate_extra_events_from_model(data_cyb, data_mil, 
                                      model = mod,
                                       p_base = .1)
  data_sim_list[[s]] = data_latent
  mod_sim <- lme4::glmer(OUTCOME~TYPE*(CYBER_LAG_NA+CONVENTIONAL_LAG_NA
                                       +LOG_INT_USERS_A_sc+LOG_INT_USERS_B_sc
                                       +CINC_A_sc)+INV_CAP_DISTANCE_sc
                         +NUCLEAR
                         +(1+TYPE2||DYAD),
                         data=data_latent,
                         family=binomial,
                         control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=1e6)) 
                         )
  res_sim_list[[s]] = mod_sim
  cat(s, 'out of', length(sim), '\n')
}
#save(data_sim_list, file = 'XXX.RData)



###################################################
# 2. Conditional bias: events generated by the model
# predictions (covariates that are a part of the model)
###################################################
  
  
##########################
# 2.1.: Cyber operations 
# are over-reported during 
# conventional conflict
##########################

# 2.1.1.: over-reported by 10% than those that took place with no conflict
#10% more during military conflict than during non-military conflict
# # Results for Figure 4 (Appendix)

res_sim_list = vector(mode = "list", length = length(sim))
names(res_sim_list) = sim

# creating list of datasets for each simulation:
data_sim_list = vector(mode = "list", length = length(sim))
names(data_sim_list) = sim

for(s in 1:length(sim)){
  set.seed(seeds[s])
  data_latent = simulate_extra_events_conditional(data_cyb, data_mil, 
                                                outcome = 'OUTCOME', 
                                                variable = 'CONVENTIONAL',
                                                model = mod, 
                                                p00=.1, 
                                                p01 = .2)
  data_sim_list[[s]] = data_latent
  mod_sim <- lme4::glmer(OUTCOME~TYPE*(CYBER_LAG_NA+CONVENTIONAL_LAG_NA
                    +LOG_INT_USERS_A_sc+LOG_INT_USERS_B_sc
                    +CINC_A_sc)+INV_CAP_DISTANCE_sc
                       +NUCLEAR
                       +(1+TYPE2||DYAD),
                       data=data_latent,
                       family=binomial,
                       control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=1e6)) # main, used in the paper
  )
res_sim_list[[s]] = mod_sim
cat(s, 'out of', length(sim), '\n')
}
#save(data_sim_list, file = '')


####################################
# 2.2. Difference in reporting 
# between autocracies and democracies
#####################################
# # Figure 5 (Appendix)

# subsetting BIH because democracy data is missing:
data_aut = data%>%
  filter(!is.na(DEMOCRACY))
dim(data_aut) # 2410 (w 2000 - function removes it later)

data_cyb_aut = data_aut %>%
  filter(TYPE==1)
data_mil_aut = data_aut %>%
  filter(TYPE==-1)

res_sim_list = vector(mode = "list", length = length(sim))
names(res_sim_list) = sim

# creating list of datasets for each simulation:
data_sim_list = vector(mode = "list", length = length(sim))
names(data_sim_list) = sim

for(s in 1:length(sim)){
set.seed(seeds[s])
data_latent = simulate_extra_events_aut(data_cyb_aut, data_mil_aut, 
                                        outcome = 'OUTCOME', 
                                        # more reporting of COs by democracies
                                        p0_aut = .2, p0_dem = .1)
data_sim_list[[s]] = data_latent
mod_sim <- lme4::glmer(OUTCOME~TYPE*(CYBER_LAG_NA+CONVENTIONAL_LAG_NA
                                     +LOG_INT_USERS_A_sc+LOG_INT_USERS_B_sc
                                     +CINC_A_sc)+INV_CAP_DISTANCE_sc#+AGREE_sc
                       +NUCLEAR
                       +(1+TYPE2||DYAD),
                       data=data_latent,
                       family=binomial,
                       control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=1e6)) # main, used in the paper
)
res_sim_list[[s]] = mod_sim
cat(s, 'out of', length(sim), '\n')
}
#save(data_sim_list, file = '')


#####################################
# 2.2. Difference in reporting 
# countries (targets of cyber operations)
# with high/low Internet
# reliance
#####################################
# # Results for Figure 6 (Appendix)

res_sim_list = vector(mode = "list", length = length(sim))
names(res_sim_list) = sim

# creating list of datasets for each simulation:
data_sim_list = vector(mode = "list", length = length(sim))
names(data_sim_list) = sim

for(s in 1:length(sim)){
set.seed(seeds[s])
data_latent = simulate_extra_events_conditional(data_cyb, data_mil, 
                                                outcome = 'OUTCOME', 
                                                variable = 'INT_USERS_B_80',
                                                model = mod, 
                                                p00=.1, 
                                                p01 = .2)
data_sim_list[[s]] = data_latent
mod_sim <- lme4::glmer(OUTCOME~TYPE*(CYBER_LAG_NA+CONVENTIONAL_LAG_NA
                                     +LOG_INT_USERS_A_sc+LOG_INT_USERS_B_sc
                                     +CINC_A_sc)+INV_CAP_DISTANCE_sc
                       +NUCLEAR
                       +(1+TYPE2||DYAD),
                       data=data_latent,
                       family=binomial,
                       control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=1e6)) 
)
res_sim_list[[s]] = mod_sim
cat(s, 'out of', length(sim), '\n')
}
#save(data_sim_list, file = '')


##########################################
# 2.3. English-speaking countries:
# Results for Figure 7 (Appendix)

res_sim_list = vector(mode = "list", length = length(sim))
names(res_sim_list) = sim

# creating list of datasets for each simulation:
data_sim_list = vector(mode = "list", length = length(sim))
names(data_sim_list) = sim

for(s in 1:length(sim)){
set.seed(seeds[s])
data_latent = simulate_extra_events_conditional(data_cyb, data_mil, 
                                                outcome = 'OUTCOME', 
                                                variable = 'ENGLISH',
                                                model = mod, 
                                                p01 = .2)
data_sim_list[[s]] = data_latent
mod_sim <- lme4::glmer(OUTCOME~TYPE*(CYBER_LAG_NA+CONVENTIONAL_LAG_NA
                                     +LOG_INT_USERS_A_sc+LOG_INT_USERS_B_sc
                                     +CINC_A_sc)+INV_CAP_DISTANCE_sc
                       +NUCLEAR
                       +(1+TYPE2||DYAD),
                       data=data_latent,
                       family=binomial,
                       control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=1e6)) 
)
res_sim_list[[s]] = mod_sim
cat(s, 'out of', length(sim), '\n')
}
#save(data_sim_list, file = '')


#####################################
# 2.4. US as a target (over-reporting)
# compared to other operations
# Results from Figure 8
#####################################

res_sim_list = vector(mode = "list", length = length(sim))
names(res_sim_list) = sim

# creating list of datasets for each simulation:
data_sim_list = vector(mode = "list", length = length(sim))
names(data_sim_list) = sim

for(s in 1:length(sim)){
set.seed(seeds[s])
data_latent = simulate_extra_events_conditional(data_cyb, data_mil, 
                                                outcome = 'OUTCOME', 
                                                variable = 'US_TARGET',
                                                model = mod, 
                                                p00=.1, 
                                                p01 = .2)
data_sim_list[[s]] = data_latent
mod_sim <- lme4::glmer(OUTCOME~TYPE*(CYBER_LAG_NA+CONVENTIONAL_LAG_NA
                                     +LOG_INT_USERS_A_sc+LOG_INT_USERS_B_sc
                                     +CINC_A_sc)+INV_CAP_DISTANCE_sc
                       +NUCLEAR
                       +(1+TYPE2||DYAD),
                       data=data_latent,
                       family=binomial,
                       control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=1e6)) 
)
res_sim_list[[s]] = mod_sim
cat(s, 'out of', length(sim), '\n')
}
#save(data_sim_list, file = '')


#####################################
# 2.5. US as an attacker (under-reporting) - non-US as an attacker (over-reporting)
# compared to other operations
# Results for Figure 9
#####################################

res_sim_list = vector(mode = "list", length = length(sim))
names(res_sim_list) = sim

# creating list of datasets for each simulation:
data_sim_list = vector(mode = "list", length = length(sim))
names(data_sim_list) = sim

for(s in 1:length(sim)){
set.seed(seeds[s])
data_latent = simulate_extra_events_conditional(data_cyb, data_mil, 
                                                outcome = 'OUTCOME', 
                                                variable = 'US_NON_ATTACKER',
                                                model = mod, 
                                                p00=.1, 
                                                p01 = .2)
data_sim_list[[s]] = data_latent
mod_sim <- lme4::glmer(OUTCOME~TYPE*(CYBER_LAG_NA+CONVENTIONAL_LAG_NA
                                     +LOG_INT_USERS_A_sc+LOG_INT_USERS_B_sc
                                     +CINC_A_sc)+INV_CAP_DISTANCE_sc
                       +NUCLEAR
                       +(1+TYPE2||DYAD),
                       data=data_latent,
                       family=binomial,
                       control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=1e6)) 
)
res_sim_list[[s]] = mod_sim
cat(s, 'out of', length(sim), '\n')
}
#save(data_sim_list, file = '')



###################################################
# After these results are saved into the folder, 
# use this code to extract FEs from the results and
# to create a dataframe

# set directory where the results are saved
flz = list.files('directory')
m=1; flz[m]

for(m in 1:length(flz)){
load(paste0('directory', flz[m]))

fixef(res_sim_list[[1]])
sim = seq(1, 200, by =1)
s=1; sim[s]
fe = fixef(res_sim_list[[s]])
data_fe = as.data.frame(matrix(
  nrow = length(sim),
  ncol = length(fe)
))
colnames(data_fe) = names(fe)
f=1; length(f)

for(s in 1:length(sim)){
  for(f in 1:length(fe)){
    data_fe[s,f]=ifelse(rownames(data_fe)[s]==sim[s]&colnames(data_fe)[f]==names(fe[f]),
                        fixef(res_sim_list[[s]])[f], NA
    )
  }
  cat(s, 'out of', length(sim), '\n')
} # s in sim
head(data_fe)
tail(data_fe) 
any(is.na(data_fe))

# saving data into a new folder:
save(data_fe, file=paste0('new_directory', flz[m]))
cat(m, 'out of', length(flz), '\n')
} # closing f-loop


####################################
# Creating plots:

flz = list.files('new_directory')
m=1; flz[m]


for(m in 1:length(flz)){
  load(paste0('new_directory', flz[m]))
  
  fe_av = data_fe %>%
    summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))
  
  # loading main model results:
  load('main_model.RData')
  mod = mod_list[[10]]
  b = fixef(mod)
  v=vcov(mod)
  
  # creating contrasts:
  L <- matrix(0, nrow=17, ncol = length(b))
  L
  colnames(L) = names(b)
  rownames(L) = c('P_cyber_base', 'P_military_base',
                  'OR_cyber_cyb(lag)', 'OR_military_cyb(lag)',
                  'OR_military_mil(lag)', 'OR_cyber_mil(lag)',
                  'OR_conflict_internet(a)','OR_conflict_internet(t)',
                  'OR_conflict_cinc(a)', 'OR_conflict_distance',
                  'OR_conflict_nuclear(t)',
                  'OR_cyber_internet(a)','OR_military_internet(a)',
                  'OR_cyber_internet(t)','OR_military_internet(t)',
                  'OR_cyber_cinc(a)', 'OR_military_cinc(a)') # OR (odds ratio)
  L[1,c('(Intercept)', 'TYPE')] <- c(1,1)
  L[2,c('(Intercept)', 'TYPE')] <- c(1,-1)
  L[3,c('CYBER_LAG_NA', 'TYPE:CYBER_LAG_NA')] <- c(1,1)
  L[4,c('CYBER_LAG_NA', 'TYPE:CYBER_LAG_NA')] <- c(1,-1)
  L[5,c('CONVENTIONAL_LAG_NA', 'TYPE:CONVENTIONAL_LAG_NA')] <- c(1,-1)
  L[6,c('CONVENTIONAL_LAG_NA', 'TYPE:CONVENTIONAL_LAG_NA')] <- c(1,1)
  L[7,c('LOG_INT_USERS_A_sc')] <- 1
  L[8,c('LOG_INT_USERS_B_sc')] <- 1
  L[9,c('CINC_A_sc')] <- 1
  L[10,c('INV_CAP_DISTANCE_sc')] <- 1
  L[11,c('NUCLEAR')] <- 1
  L[12,c('LOG_INT_USERS_A_sc', 'TYPE:LOG_INT_USERS_A_sc')] <- c(1,1)
  L[13,c('LOG_INT_USERS_A_sc', 'TYPE:LOG_INT_USERS_A_sc')] <- c(1,-1)
  L[14,c('LOG_INT_USERS_B_sc', 'TYPE:LOG_INT_USERS_B_sc')] <- c(1,1)
  L[15,c('LOG_INT_USERS_B_sc', 'TYPE:LOG_INT_USERS_B_sc')] <- c(1,-1)
  L[16,c('CINC_A_sc', 'TYPE:CINC_A_sc')] <- c(1,1)
  L[17,c('CINC_A_sc', 'TYPE:CINC_A_sc')] <- c(1,-1)
  L
  res <- L%*%t(data_fe) # 17 by 200
  dim(res)
  range(res[1,])
  
  #########################
  #
  res_mean = apply(res, 1, mean)
  res_sd = apply(res, 1, sd)
  res_lo = res_mean-1.96*res_sd/ncol(res)
  res_up = res_mean+1.96*res_sd/ncol(res)
  res_mean = cbind(res_mean, res_lo, res_up)%>%
    round(2)
  res_mean = res_mean[-c(1:2),]
  res_mean
  
  ###############################################
  # creating two separate plots for non-cyber and cyber indicators:
  
  ##############
  # non-cyber
  res_mean_non_cyber = as.data.frame(res_mean[-c(1:4),])
  target_non_cyber_se = sqrt(diag(L[-c(1:6),]%*%v%*%t(L[-c(1:6),])))
  target_non_cyber = tibble(
    estimates_OR = rownames(L[-c(1:6),]),
    value=as.vector(exp(L[-c(1:6),]%*%b)),
    se = as.vector(target_non_cyber_se),
    lo = as.vector(exp(L[-c(1:6),]%*%b - 1.96*target_non_cyber_se)),
    up = as.vector(exp(L[-c(1:6),]%*%b + 1.96*target_non_cyber_se))
  )
  
  name = gsub(".RData","",sapply(strsplit(flz[m],"_"),"[",4))
  name_perc = sub('%', 'perc', name)
  
  ggplot()+
    geom_point(target_non_cyber, mapping=aes(x=estimates_OR, y=value),
               color = "black", size = 2)+
    geom_errorbar(data=target_non_cyber, mapping=aes(x=estimates_OR, ymin=lo, ymax=up), width=.1, color = 'black')+
    geom_point(res_mean_non_cyber, mapping=aes(x=rownames(res_mean_non_cyber), y=exp(res_mean)), color = 'grey', size = 2)+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
    theme(axis.text=element_text(size=11))+
     theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
          panel.background = element_blank(), axis.line = element_line(colour = "black"))+
    scale_y_log10()+
    geom_hline(yintercept = 1, lty='dotted')+
    xlab("") + ylab("Estimated values (in odds ratio)")+
    coord_flip()
  ggsave(paste0('new_location/non-cyber_',name_perc,'.png'))

  ##############
  # cyber
  res_mean_cyber = as.data.frame(res_mean[1:4,])
  target_cyber_se = sqrt(diag(L[3:6,]%*%v%*%t(L[3:6,])))
  target_cyber = tibble(
    estimates_OR = rownames(L[3:6,]),
    value=as.vector(exp(L[3:6,]%*%b)),
    se = as.vector(target_cyber_se),
    lo = as.vector(exp(L[3:6,]%*%b - 1.96*target_cyber_se)),
    up = as.vector(exp(L[3:6,]%*%b + 1.96*target_cyber_se))
  )
  
  ggplot()+
    geom_point(target_cyber, mapping=aes(x=estimates_OR, y=value),
               color = "black", size = 2)+
    geom_errorbar(data=target_cyber, mapping=aes(x=estimates_OR, ymin=lo, ymax=up), width=.1, color = 'black')+
    geom_point(res_mean_cyber, mapping=aes(x=rownames(res_mean_cyber),
                                           y=exp(res_mean)), color = 'grey', size = 2)+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
    # size of axis labels
    theme(axis.text=element_text(size=11))+
    # transparent background
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
          panel.background = element_blank(), axis.line = element_line(colour = "black"))+
    scale_y_log10()+
    geom_hline(yintercept = 1, lty='dotted')+
    xlab("") + ylab("Estimated values (in odds ratio)")+
    coord_flip()
  ggsave(paste0('new_location/cyber_',name_perc,'.png'))
  
} # closing m-loop


